Filter criteria:
Filter differentially expressed genes between autism and control (p-value < 0.05)
No samples are removed based on network connectivity z-scores
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)
First principal component explains over 89.7% of the total variance (lower than 90%…)
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
datExpr = data.frame(datExpr)
datExpr_pca = prcomp(datExpr, scale.=TRUE)
last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>%
filter(.[[2]] >= var_explained) %>% top_n(-1, ID)
par(mfrow=c(1,2))
plot(summary(datExpr_pca)$importance[2,], type='b')
abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
plot(summary(datExpr_pca)$importance[3,], type='b')
abline(h=var_explained, col='blue')
print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
var_explained*100, '% of the variance'))
datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 11 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)
Perhaps the first component is determined by a small number of samples and by removing them we can lower the variance explained by it:
plot(sort(pca_output$rotation[,1], decreasing = TRUE))
Genes are still separated into two clouds of points:
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')
# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]
# Calculate differential expression p-value of each gene
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() + theme_minimal() +
scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)
rm(mod, corfit, lmfit, fit, ASD_pvals, log_fold_change, top_genes, ordered_top_genes, datExpr, datProbes, datSeq, p1, p2)
clusterings = list()
set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=6 as best number of clusters.
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.
h_clusts = datExpr_redDim %>% dist %>% hclust
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 6
clusterings[['hc']] = cutree(h_clusts, best_k)
Samples are grouped into two big clusters and two outliers, the first big cluster has three main subclusters, two small subclusters and two outliers, and the second one two main subclustersand five small subclusters.
*Output plots in clustering_genes_03_20 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.01 using the fdrtool package
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
Leaves most of the observations (~80%) without a cluster (every time it leaves out more genes):
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 5
## 2355 425 112 27 8 1
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
best_power=56 but blockwiseModules only accepts powers up to 30, so 30 was used instead
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(20, 60, by=2))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 20 0.238 -0.225 0.940 516 517 1130
## 2 22 0.302 -0.248 0.933 478 463 1080
## 3 24 0.366 -0.272 0.942 446 417 1040
## 4 26 0.436 -0.293 0.958 417 376 998
## 5 28 0.485 -0.314 0.949 391 339 963
## 6 30 0.522 -0.330 0.945 368 309 931
## 7 32 0.578 -0.352 0.967 347 282 901
## 8 34 0.625 -0.371 0.965 328 258 873
## 9 36 0.645 -0.384 0.954 311 236 847
## 10 38 0.688 -0.402 0.946 295 215 822
## 11 40 0.715 -0.415 0.938 281 200 799
## 12 42 0.740 -0.426 0.935 267 185 778
## 13 44 0.763 -0.442 0.930 255 172 757
## 14 46 0.787 -0.457 0.936 244 159 738
## 15 48 0.802 -0.469 0.933 233 148 719
## 16 50 0.809 -0.480 0.923 223 138 702
## 17 52 0.823 -0.493 0.924 214 128 685
## 18 54 0.839 -0.503 0.929 206 119 669
## 19 56 0.854 -0.515 0.933 198 111 654
## 20 58 0.858 -0.527 0.926 190 104 640
## 21 60 0.872 -0.534 0.937 183 98 626
network = datExpr_redDim %>% t %>% blockwiseModules(power=30, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It leaves 304 genes without a cluster:
clusterings[['WGCNA']] %>% table
## .
## 0 1 2 3 4
## 304 1742 528 227 127
Number of clusters that resemble more Gaussian mixtures = 39
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 39
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)
Plot of clusters with their centroids in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Trying with 27 clusters (27 has the 9th lowest value)
best_k = 27
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 159 71 159 165 109 122 96 106 135 78 54 109 154 90 146 71 102 40
## 19 20 21 22 23 24 25 26 27
## 160 159 99 87 55 98 82 67 155
Trying with 6 clusters
best_k = 6
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_3']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_3']] %>% table
## .
## 1 2 3 4 5 6
## 662 455 366 684 431 330
Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.
manual_clusters = as.factor(as.numeric(-0.1*datExpr_redDim$PC1 + 0.05 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() +
geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
Cluster 2 has a bigger mean expression than cluster 1. There also seem to be more than one distributions in cluster 2, with two different means and three different standard deviations?
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
Separating cluster 2 genes by their sd:
c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_sd = c2_sd %>% GMM(3)
manual_clusters_data %>% ggplot(aes(x=sd)) +
stat_function(fun=dnorm, n=100, colour='#99cc00', # green
args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
stat_function(fun=dnorm, n=100, colour='#2eb8b8', # acqua
args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
stat_function(fun=dnorm, n=100, colour='#a64dff', # purple
args=list(mean=gmm_c2_sd$centroids[3], sd=gmm_c2_sd$covariance_matrices[3])) +
theme_minimal()
clusterings[['Manual2']] = as.character(clusterings[['Manual']])
clusterings[['Manual2']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>%
apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual2']] %>% table
## .
## 0 1_1 1_2 1_3
## 1713 422 436 357
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data, c2_sd, gmm_c2_sd, p1, p2,
pca_data_projection)
Using Adjusted Rand Index:
Clusterings seem to give very different results and none resembles the manual separation
Simple methods give similar results (K-means, hierarchical clustering, consensus clustering)
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
cluster1 = as.factor(clusterings[[i]])
for(j in (i):length(clusterings)){
cluster2 = as.factor(clusterings[[j]])
cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
}
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim)
create_2D_plot = function(cat_var){
ggplotly(plot_points %>% ggplot(aes_string(x='PC1', y='PC2', color=cat_var)) +
geom_point() + theme_minimal() +
xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,1]*100,2),'%)')) +
ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')))
}
create_3D_plot = function(cat_var){
plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>%
layout(title = glue('Samples coloured by ', cat_var),
scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))
}
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), km_clust = as.factor(clusterings[['km']]),
hc_clust = as.factor(clusterings[['hc']]), cc_l1_clust = as.factor(clusterings[['cc_l1']]),
cc_clust = as.factor(clusterings[['cc_l2']]), ica_clust = as.factor(clusterings[['ICA_min']]),
n_ica_clust = as.factor(rowSums(ICA_clusters)), gmm_clust = as.factor(clusterings[['GMM']]),
gmm_clust2 = as.factor(clusterings[['GMM_2']]), gmm_clust3 = as.factor(clusterings[['GMM_3']]),
wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]),
manual2_clust=as.factor(clusterings[['Manual2']]))
Simple methods seem to only partition the space in buckets using information from the first component
All clusterings except K-means manage to separate the two clouds at least partially, but no one does a good job
WGCNA creates clusters inverted between clouds
Manual classification + GMM by standard deviation clusters make sense
create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('gmm_clust2')
create_2D_plot('gmm_clust3')
create_2D_plot('wgcna_clust')
create_2D_plot('manual2_clust')
create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')
create_3D_plot('manual2_clust')